1 Introduction

Here we will perform a quality control (QC) of the samples obtained from Newcastle.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(scDblFinder)
library(tidyverse)
library(readxl)
library(ggpubr)
library(here)
library(glue)
library(DT)
library(harmony)

2.2 Parameters

# Functions
source(here::here("scRNA-seq/bin/utils.R"))


# Thresholds
min_lib_size_frozen <- 1000
min_lib_size_fresh <- 850
min_n_genes_frozen <- 450
min_n_genes_fresh <- 400
max_pct_mt <- 17.5
min_cells <- 5

2.3 Read data

meatadata_newcastle <- read_excel(
  here("data/Newcastle_samples/Tonsil_cell_atlas_metadata.xlsx"),
  col_names = TRUE
)
donor_metadata <- read_csv(here("data/tonsil_atlas_donor_metadata.csv"))
DT::datatable(meatadata_newcastle)
DT::datatable(donor_metadata)
dirs <- list.dirs(here("data/Newcastle_samples"), full.names = FALSE)
dirs <- str_subset(dirs, "Results")
counts_list <- map(dirs, \(x) {
  mat <- Read10X(here(glue("data/Newcastle_samples/{x}")))
  mat
})
names(counts_list) <- str_remove(dirs, "_Results")

2.4 Create Seurat object

seurat_list <- map(names(counts_list), \(x) {
  mat <- counts_list[[x]]
  seurat_obj <- CreateSeuratObject(counts = mat)
  seurat_obj$gem_id <- x
  seurat_obj$donor_id_newcastle <- str_sub(x, start = 1, end = 5)
  seurat_obj$preservation <- str_extract(x, "(fresh|frozen)")
  seurat_obj <- RenameCells(seurat_obj, add.cell.id = x)
  seurat_obj
})
names(seurat_list) <- names(counts_list)
seurat_list
## $`TIP01-1_fresh`
## An object of class Seurat 
## 36601 features across 14951 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP01-1_frozen`
## An object of class Seurat 
## 36601 features across 5256 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP01-2_frozen`
## An object of class Seurat 
## 36601 features across 13170 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP02-1_fresh`
## An object of class Seurat 
## 36601 features across 13557 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP02-1_frozen`
## An object of class Seurat 
## 36601 features across 9229 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP02-2_fresh`
## An object of class Seurat 
## 36601 features across 15335 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP02-2_frozen`
## An object of class Seurat 
## 36601 features across 10369 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP03-1_fresh`
## An object of class Seurat 
## 36601 features across 14435 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP03-1_frozen`
## An object of class Seurat 
## 36601 features across 5644 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP03-2_fresh`
## An object of class Seurat 
## 36601 features across 16425 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP03-2_frozen`
## An object of class Seurat 
## 36601 features across 16167 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP04-1_fresh`
## An object of class Seurat 
## 36601 features across 5563 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP04-1_frozen`
## An object of class Seurat 
## 36601 features across 5820 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP04-2_fresh`
## An object of class Seurat 
## 36601 features across 3508 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
## 
## $`TIP04-2_frozen`
## An object of class Seurat 
## 36601 features across 5189 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
rm(counts_list)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  10927220  583.6   19619800 1047.9  19619800 1047.9
## Vcells 327967216 2502.2  578039883 4410.1 577247477 4404.1
seurat <- Reduce(merge, seurat_list)
seurat
## An object of class Seurat 
## 36601 features across 154618 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
rm(seurat_list)
gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells  10778545  575.7   19619800  1047.9   19619800  1047.9
## Vcells 633058392 4829.9 2110084061 16098.7 2624712661 20025.0
# Add donor metadata
donor_metadata <- donor_metadata[donor_metadata$hospital == "Newcastle", ]
donor_metadata$donor_id_newcastle <- str_remove(
  donor_metadata$comments,
  "original id: "
)
seurat$barcode <- colnames(seurat)
new_metadata <- left_join(
  seurat@meta.data,
  donor_metadata,
  by = "donor_id_newcastle"
)
rownames(new_metadata) <- new_metadata$barcode
seurat@meta.data <- new_metadata

3 Distribution of QC metrics across samples

# Number of detected genes
seurat@meta.data %>%
  ggplot(aes(gem_id, nFeature_RNA, fill = preservation)) +
    geom_boxplot(outlier.size = 0.1) +
    scale_y_continuous(
      limits = c(0, 8000),
      breaks = c(0, 1000, 2000, 3000, 5000, 6000, 7000, 8000)
    ) +
    ylab("Number of detected genes") +
    theme_classic() +
    theme(
      legend.title = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

# Percentage of mitochondrial expression
seurat$pct_mt <- PercentageFeatureSet(seurat, pattern = "^MT-")
seurat@meta.data %>%
  ggplot(aes(gem_id, pct_mt, fill = preservation)) +
    geom_boxplot(outlier.size = 0.1) +
    scale_y_continuous(limits = c(0, 100)) +
    ylab("% of mitochondrial expression") +
    theme_classic() +
    theme(
      legend.title = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

4 Filter cells

Library size:

seurat_list <- SplitObject(seurat, split.by = "preservation")
min_lib_sizes <- c(min_lib_size_fresh, min_lib_size_frozen)
names(min_lib_sizes) <- names(seurat_list)
for (x in names(seurat_list)) {
  print(x)
  lib_size_hist <- seurat_list[[x]]@meta.data %>%
  plot_histogram_qc(x = "nCount_RNA", x_lab = "Library Size (log10(total UMI))") +
  geom_vline(xintercept = min_lib_sizes[x], linetype = "dashed", color = "red")
  lib_size_hist1 <- lib_size_hist +
      scale_x_log10()
  lib_size_hist2 <- lib_size_hist +
      scale_x_continuous(limits = c(0, 4000)) +
      xlab("Library Size (total UMI)") +
      theme_pubr()
  lib_size_arranged <- lib_size_hist1 + lib_size_hist2
  print(lib_size_arranged)
}
## [1] "fresh"

## [1] "frozen"

Number of detected genes:

min_n_genes_l <- c(min_n_genes_fresh, min_n_genes_frozen)
names(min_n_genes_l) <- names(seurat_list)
for (x in names(seurat_list)) {
  print(x)
  n_genes_hist1 <- seurat_list[[x]]@meta.data %>%
    plot_histogram_qc(x = "nFeature_RNA", x_lab = "Number of Detected Genes") +
      geom_vline(xintercept = min_n_genes_l[x], linetype = "dashed", color = "red")
  n_genes_hist2 <- n_genes_hist1 +
    scale_x_continuous(limits = c(0, 2000)) 
  n_genes_hist_arranged <- n_genes_hist1 + n_genes_hist2
  print(n_genes_hist_arranged)
}
## [1] "fresh"

## [1] "frozen"

Percentage of mitochondrial expression:

for (x in names(seurat_list)) {
  print(x)
  pct_mt_hist <- seurat_list[[x]]@meta.data %>%
    plot_histogram_qc(x = "pct_mt", x_lab = "% Mitochondrial Expression") +
    geom_vline(xintercept = max_pct_mt, linetype = "dashed", color = "red") +
    scale_x_continuous(limits = c(0, 100))
  print(pct_mt_hist)
}
## [1] "fresh"

## [1] "frozen"

Joint QC metrics:

for (x in names(seurat_list)) {
  print(x)
  pct_mt_vs_lib_size <- FeatureScatter(
    seurat_list[[x]],
    feature1 = "nCount_RNA",
    feature2 = "pct_mt",
    pt.size = 0.15,
    cols = rep("black", length(levels(Idents(seurat_list[[x]]))))
  )
  pct_mt_vs_lib_size <- pct_mt_vs_lib_size +
    labs(x = "Library Size (total UMI)", y = "% Mitochondrial Expression") +
    theme(legend.position = "none", plot.title = element_blank()) +
    geom_vline(xintercept = min_lib_sizes[x], linetype = "dashed", color = "red") +
    geom_hline(yintercept = max_pct_mt, linetype = "dashed", color = "red")
  print(pct_mt_vs_lib_size)
}
## [1] "fresh"

## [1] "frozen"

Let us perform linear (PCA) and nonlinear (UMAP) dimensionality reduction to assess the batch effects between fresh and frozen samples:

rm(seurat_list)
gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells  10956388  585.2   19619800  1047.9   19619800  1047.9
## Vcells 636376873 4855.2 2110084061 16098.7 2624712661 20025.0
seurat <- seurat %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  RunUMAP(dims = 1:30, reduction = "pca")
DimPlot(seurat, group.by = "preservation")

Indeed, preservation is the main source of batch effects.

Subset cells:

is_low_quality <- with(seurat@meta.data,
  (preservation == "fresh" & nCount_RNA < min_lib_size_fresh) |
  (preservation == "frozen" & nCount_RNA < min_lib_size_frozen) |
  (preservation == "fresh" & nFeature_RNA < min_n_genes_fresh) |
  (preservation == "frozen" & nFeature_RNA < min_n_genes_frozen) |
  (pct_mt > max_pct_mt)
)
table(is_low_quality)
## is_low_quality
##  FALSE   TRUE 
## 121395  33223
seurat <- seurat[, !is_low_quality]
gc()
##              used    (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   11017963   588.5   19619800  1047.9   19619800  1047.9
## Vcells 1773028718 13527.2 3038697047 23183.5 2624712661 20025.0
DimPlot(seurat, group.by = "preservation")

We now assess if fresh and frozen can be integrated with Harmony:

seurat$preservation <- factor(seurat$preservation)
seurat <- seurat %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  RunHarmony(group.by.vars = "preservation") %>%
  RunUMAP(dims = 1:30, reduction = "harmony")
DimPlot(
  seurat,
  group.by = "preservation",
  raster = FALSE,
  split.by = "preservation"
)

5 Filter out genes

According to Luecken MD et al.: “A guideline to setting this threshold is to use the minimum cell cluster size that is of interest and leaving some leeway for dropout effects. For example, filtering out genes expressed in fewer than 20 cells may make it difficult to detect cell clusters with fewer than 20 cells. For datasets with high dropout rates, this threshold may also complicate the detection of larger clusters. The choice of threshold should scale with the number of cells in the dataset and the intended downstream analysis.”

Since we want to detect rare cell types in the tonsil, we will be very permissive and retain genes that are expressed in at least 5 cells.

5.1 Set threshold

n_cells <- Matrix::rowSums(seurat[["RNA"]]@counts > 0)
gene_qc <- n_cells %>% 
  as.data.frame() %>% 
  ggplot(aes(n_cells)) + 
    geom_histogram(bins = 100, alpha = 0.75) +
    scale_x_log10("Number of cells") +
    theme_bw() 
gene_qc +
  geom_vline(xintercept = min_cells, linetype = "dashed", color = "red")

5.2 Plot genes with highest expression

top_50_genes <- sort(n_cells, decreasing = TRUE)[1:50]
top_50_genes_df <- data.frame(
  gene = names(top_50_genes),
  n_cells = top_50_genes
)
top_50_genes_df %>%
  ggplot(aes(fct_reorder(gene, n_cells), n_cells)) +
    geom_point() +
    labs(x = "", y = "Number of expressing cells") +
    coord_flip()

5.3 Filter

kept_genes <- rownames(seurat)[n_cells > min_cells]
table(n_cells > min_cells)
## 
## FALSE  TRUE 
## 10612 25989
(seurat <- subset(seurat, features = kept_genes))
## An object of class Seurat 
## 25989 features across 121395 samples within 1 assay 
## Active assay: RNA (25989 features, 1984 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony

6 Doublet detection

7 Doublets

We use scDblFinder:

sce <- as.SingleCellExperiment(seurat)
dbl_densities <- computeDoubletDensity(
  sce,
  dims = 30,
  subset.row = VariableFeatures(seurat)
)
summary(dbl_densities)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1200  0.3800  0.9474  0.9600 38.4200
seurat$doublet_score <- dbl_densities
FeaturePlot(
  seurat,
  features = "doublet_score",
  raster = FALSE
) &
  scale_color_viridis_c(option = "magma")

FeaturePlot(
  seurat,
  features = c("CD79A", "CD3D"),
  raster = FALSE
) &
  scale_color_viridis_c(option = "magma")

FeatureScatter(
  seurat,
  feature1 = "CD79A",
  feature2 = "CD3D",
  group.by = "preservation"
) +
  geom_vline(xintercept = 1) +
  geom_hline(yintercept = 1)

seurat$is_doublet_B_T <- ifelse(
  ((seurat[["RNA"]]@data["CD79A", ] > 1) & (seurat[["RNA"]]@data["CD3D", ] > 1)),
  "doublet_B_T",
  "not_doublet_B_T"
)
DimPlot(seurat, group.by = "is_doublet_B_T", raster = FALSE)

seurat@meta.data %>%
  ggplot(aes(doublet_score)) +
    geom_histogram(bins = 100) +
    theme_pubr()

seurat$is_doublet <- seurat$doublet_score > 5
DimPlot(seurat, group.by = "is_doublet", raster = FALSE)

(seurat <- seurat[, !seurat$is_doublet])
## An object of class Seurat 
## 25989 features across 117565 samples within 1 assay 
## Active assay: RNA (25989 features, 1984 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony

Let’s assess the number of cells after QC, colored by preservation strategy (fresh/frozen):

seurat@meta.data %>%
  group_by(gem_id, preservation) %>%
  summarize(n_cells = n()) %>%
  ggplot(aes(fct_reorder(gem_id, n_cells), n_cells, fill = preservation)) +
    geom_col() +
    geom_text(aes(label = n_cells), hjust = -0.1) +
    ylab("Number of cells") +
    theme_pubr() +
    theme(axis.title.y = element_blank()) +
    coord_flip()

seurat@meta.data %>%
  ggplot(aes(gem_id, nFeature_RNA, fill = preservation)) +
    geom_boxplot(outlier.size = 0.1) +
    scale_y_continuous(
      limits = c(0, 8000),
      breaks = c(0, 1000, 2000, 3000, 5000, 6000, 7000, 8000)
    ) +
    ylab("Number of detected genes") +
    theme_classic() +
    theme(
      legend.title = element_blank(),
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

8 UPDATE

Downstream in the analysis we detected that cyropreserved (frozen) cells displayed biased proportions as compared with the freshly processed ones. Thus, we will exclude them from the atlas, as they are unreliable:

seurat
## An object of class Seurat 
## 25989 features across 117565 samples within 1 assay 
## Active assay: RNA (25989 features, 1984 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony
(seurat <- seurat[, seurat$preservation == "fresh"])
## An object of class Seurat 
## 25989 features across 58028 samples within 1 assay 
## Active assay: RNA (25989 features, 1984 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony

9 Save

dir.create(here("scRNA-seq/results/R_objects/seurat_objects_newcastle"))
saveRDS(seurat, here("scRNA-seq/results/R_objects/seurat_objects_newcastle/1-seurat_object_newcastle_filtered.rds"))

10 Session Information

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /home/groups/singlecell/rmassoni/anaconda3/envs/richter/lib/libopenblasp-r0.3.21.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] harmony_0.1.1               Rcpp_1.0.10                 DT_0.27                     glue_1.6.2                  here_1.0.1                  ggpubr_0.6.0                readxl_1.4.2                forcats_1.0.0               stringr_1.5.0               dplyr_1.1.0                 purrr_1.0.1                 readr_2.1.4                 tidyr_1.3.0                 tibble_3.1.8                ggplot2_3.4.1               tidyverse_1.3.2             scDblFinder_1.12.0          SingleCellExperiment_1.20.0 SummarizedExperiment_1.28.0 Biobase_2.58.0              GenomicRanges_1.50.0        GenomeInfoDb_1.34.9         IRanges_2.32.0              S4Vectors_0.36.0            BiocGenerics_0.44.0         MatrixGenerics_1.10.0       matrixStats_0.63.0          SeuratObject_4.1.3          Seurat_4.3.0                BiocStyle_2.26.0           
## 
## loaded via a namespace (and not attached):
##   [1] rtracklayer_1.58.0        scattermore_0.8           R.methodsS3_1.8.2         bit64_4.0.5               knitr_1.42                R.utils_2.12.2            irlba_2.3.5.1             DelayedArray_0.24.0       data.table_1.14.6         RCurl_1.98-1.10           generics_0.1.3            ScaledMatrix_1.6.0        cowplot_1.1.1             RANN_2.6.1                future_1.31.0             bit_4.0.5                 tzdb_0.3.0                spatstat.data_3.0-0       xml2_1.3.3                lubridate_1.9.1           httpuv_1.6.9              assertthat_0.2.1          viridis_0.6.2             gargle_1.3.0              xfun_0.37                 hms_1.1.2                 jquerylib_0.1.4           evaluate_0.20             promises_1.2.0.1          fansi_1.0.4               restfulr_0.0.15           dbplyr_2.3.0              igraph_1.3.5              DBI_1.1.3                 htmlwidgets_1.6.1         spatstat.geom_3.0-6       googledrive_2.0.0         ellipsis_0.3.2            crosstalk_1.2.0           backports_1.4.1           bookdown_0.33             deldir_1.0-6              sparseMatrixStats_1.10.0  vctrs_0.5.2               ROCR_1.0-11              
##  [46] abind_1.4-5               cachem_1.0.6              withr_2.5.0               progressr_0.13.0          vroom_1.6.1               sctransform_0.3.5         GenomicAlignments_1.34.0  scran_1.26.2              goftest_1.2-3             cluster_2.1.4             lazyeval_0.2.2            crayon_1.5.2              spatstat.explore_3.0-6    labeling_0.4.2            edgeR_3.40.2              pkgconfig_2.0.3           nlme_3.1-162              vipor_0.4.5               rlang_1.0.6               globals_0.16.2            lifecycle_1.0.3           miniUI_0.1.1.1            modelr_0.1.10             rsvd_1.0.5                cellranger_1.1.0          rprojroot_2.0.3           polyclip_1.10-4           lmtest_0.9-40             Matrix_1.5-3              carData_3.0-5             zoo_1.8-11                reprex_2.0.2              beeswarm_0.4.0            ggridges_0.5.4            googlesheets4_1.0.1       png_0.1-8                 viridisLite_0.4.1         rjson_0.2.21              bitops_1.0-7              R.oo_1.25.0               KernSmooth_2.23-20        Biostrings_2.66.0         DelayedMatrixStats_1.20.0 parallelly_1.34.0         spatstat.random_3.1-3    
##  [91] rstatix_0.7.2             ggsignif_0.6.4            beachmat_2.14.2           scales_1.2.1              magrittr_2.0.3            plyr_1.8.8                ica_1.0-3                 zlibbioc_1.44.0           compiler_4.2.2            dqrng_0.3.0               BiocIO_1.8.0              RColorBrewer_1.1-3        fitdistrplus_1.1-8        Rsamtools_2.14.0          cli_3.6.0                 XVector_0.38.0            listenv_0.9.0             patchwork_1.1.2           pbapply_1.7-0             MASS_7.3-58.2             tidyselect_1.2.0          stringi_1.7.12            highr_0.10                yaml_2.3.7                BiocSingular_1.14.0       locfit_1.5-9.7            ggrepel_0.9.3             grid_4.2.2                sass_0.4.5                tools_4.2.2               timechange_0.2.0          future.apply_1.10.0       parallel_4.2.2            bluster_1.8.0             metapod_1.6.0             gridExtra_2.3             farver_2.1.1              Rtsne_0.16                digest_0.6.31             BiocManager_1.30.19       shiny_1.7.4               car_3.1-2                 broom_1.0.3               scuttle_1.8.4             later_1.3.0              
## [136] RcppAnnoy_0.0.20          httr_1.4.4                colorspace_2.1-0          rvest_1.0.3               XML_3.99-0.13             fs_1.6.1                  tensor_1.5                reticulate_1.26           splines_4.2.2             uwot_0.1.14               statmod_1.5.0             spatstat.utils_3.0-1      scater_1.26.1             sp_1.6-0                  xgboost_1.7.5.1           plotly_4.10.1             xtable_1.8-4              jsonlite_1.8.4            R6_2.5.1                  pillar_1.8.1              htmltools_0.5.4           mime_0.12                 fastmap_1.1.0             BiocParallel_1.32.5       BiocNeighbors_1.16.0      codetools_0.2-19          utf8_1.2.3                lattice_0.20-45           bslib_0.4.2               spatstat.sparse_3.0-0     ggbeeswarm_0.7.1          leiden_0.4.3              survival_3.5-3            limma_3.54.0              rmarkdown_2.20            munsell_0.5.0             GenomeInfoDbData_1.2.9    haven_2.5.1               reshape2_1.4.4            gtable_0.3.1